## MEP Tweets ##
load("alltweets.RData")

# tokenization and cleaning 
alltweets <- alltweets %>% 
  drop_na(text_translated)

alltweets <- alltweets[-c(10613,16538,24731),] # delete docs without topic
alltweets$docnum <- 1:32044

mepcorp <- corpus(alltweets,text_field="text_translated")
 
meptoks <- tokens(mepcorp,
                        remove_punct=T,
                        remove_numbers=T) %>%
  tokens_remove(stopwords("en"))

meptoks <- tokens_remove(meptoks, c("rt", "can", "now", "like", "just", "us", "yes", "something", "must"))

mep_dfm <- dfm(meptoks)
mep_dfm


### Estimating a structural topic model ###

## identify optimal number of topics 
#stm_obj<- convert(mep_dfm,"stm") # UNCOMMENT THIS IF YOU WANT TO RERUN THE TOPIC MODEL

#ksearch<-searchK(stm_obj$documents,stm_obj$vocab,
#                 K = seq(5, 15, by = 2), max.em.its = 10)
#plot(ksearch) ## 13 topics have high held-out likelihood and low residuals

## estimate structural topic model, controlling for country and party group
#mep_stm <- stm(mep_dfm, K=13, seed=2023, 
#               prevalence= ~country + ~eu_party_group,
#               data=docvars(mep_dfm),
#               max.em.its = 1000)
#mep_stm


# merge dataset and topics
#topics <- make.dt(mep_stm)
#save(topics, file = "stmtopics.RData")
load("stmtopics.RData") # load this to skip running the structural topic model

alltweets$docnum <- 1:32044

a <- full_join(alltweets, topics)

# plot topics over time
# make date numeric
a <- tidyr::separate(a, created_at, c("date", "time"), sep = "T")
a$date <- as.Date(a$date)
a$date <- as.numeric(a$date)
a$date <- a$date - 18854 # 15 Aug 2021 is 0 now (Taliban Takeover)

a <- a %>%
  group_by(date) %>%
  mutate(obs = n())

migstm <- a %>%
  group_by(date, obs) %>%
  filter(n()>9) %>%
  summarize(mean = mean(Topic8, na.rm = TRUE)) %>%
  mutate(up = mean + 1.96 * sqrt((mean * (1 - mean)) / obs)) %>% 
  mutate(low = mean - 1.96 * sqrt((mean * (1 - mean)) / obs)) %>% 
  ungroup()

stmplot <- ggplot(migstm, aes(x = date, y = mean)) + 
  geom_line(size = 2, color = "slateblue4") +
  geom_path(aes(x = date, y = up),size = 1, linetype = "dashed") + 
  geom_path(aes(x = date, y = low),size = 1, linetype = "dashed") + 
  xlab("Days") +
  ylab("Prevalence") +
  scale_x_continuous(breaks= c(-15, -10, -5, 0, 5, 10, 15)) +
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 1) +
  theme_light(base_size = 25) +
  ggtitle("A: Twitter Structural Topic Model")
stmplot  
########################



### Estimating a seeded lda topic model ###
dict <- dictionary(list(immigration = c("asylum", "ethnic",
                                            "foreign*", "halal",
                                            "immigr*", "integrat*",
                                            "*migrant*", "minority",
                                            "reform", "refug*",
                                            "evac*", "flee*",
                                            "airport", "checkpoint",
                                            "escap*"),
                   parliament = c("vote", "parliament*", "politic*", "elect*", "policy"),
                   climate = c("climate", "green", "clean", "energy", "electricity", "nature"),
                   europe = c("europe", "eu", "german", "france", "spain", "italy", "poland", "greece",
                              "hungary"),
                   democracy = c("law", "democra*", "rights", "government", "freedom", "court", "opposition")))

#alternative seed terms for robustness
#dict <- dictionary(list(immigration = c("asylum", "ethnic",
#                                        "foreign*", "halal",
#                                        "immigr*", "integrat*",
#                                        "*migrant*", "minority",
#                                        "reform", "refug*",
#                                        "evac*", "flee*",
#                                        "airport", "checkpoint",
#                                        "escap*"),
#                        economy = c("trade","economy","fiscal","monetary","tariff",
#                                       "export","import","deficit","recovery","investment"),
#                        climate = c("climate","sustainability","emissions","renewable",
#                                    "biodiversity","environment","action","green","carbon","ParisAgreement"),
#                        healthcare = c("healthcare","publichealth","vaccination","access",
#                                   "pandemic","reform","pharmaceutical","funding",
#                                   "infrastructure","equity")))


#set.seed(2023) # UNCOMMENT THIS IF YOU WANT TO RERUN THE TOPIC MODEL
#topicslda <- textmodel_seededlda(mep_dfm, dict, residual = TRUE)

#top20 <- seededlda::terms(topicslda, 20)
#top20  

#ldaframe <- as.data.frame(topicslda$theta)
#save(ldaframe, file = "ldaframe.RData")
load("ldaframe.RData") # load this to skip running the seeded topic model
ldaframe$docnum <- 1:32044

b <- full_join(alltweets, ldaframe)

b <- tidyr::separate(b, created_at, c("date", "time"), sep = "T")
b$date <- as.Date(b$date)
b$date <- as.numeric(b$date)
b$date <- b$date - 18854 # 15 Aug 2021 is 0 now (Taliban Takeover) 

b <- b %>%
  group_by(date) %>%
  mutate(obs = n())


miglda <- b %>%
  group_by(date, obs) %>%
  filter(n()>9) %>%
  summarize(mean = mean(immigration, na.rm = TRUE)) %>%
  mutate(up = mean + 1.96 * sqrt((mean * (1 - mean)) / obs)) %>% 
  mutate(low = mean - 1.96 * sqrt((mean * (1 - mean)) / obs)) %>% 
  ungroup()

seededldaplot <- ggplot(miglda, aes(x = date, y = mean)) + 
  geom_line(size = 2, color = "slateblue4") +
  geom_path(aes(x = date, y = up),size = 1, linetype = "dashed") + 
  geom_path(aes(x = date, y = low),size = 1, linetype = "dashed") + 
  xlab("Days") +
  ylab("Prevalence") +
  scale_x_continuous(breaks= c(-15, -10, -5, 0, 5, 10, 15)) +
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 1) +
  theme_light(base_size = 25) +
  ggtitle("B: Twitter Seeded LDA Topic Model")
seededldaplot
#####################




## Factiva Immigration Topic Prevalence ##
load("factivatopic.RData")

factivatopic$up <-  factivatopic$ImmigrationProp + 
  1.96 * sqrt((factivatopic$ImmigrationProp * 
                 (1 - factivatopic$ImmigrationProp)) / factivatopic$AllArticles)
factivatopic$low <-  factivatopic$ImmigrationProp - 
  1.96 * sqrt((factivatopic$ImmigrationProp * 
                 (1 - factivatopic$ImmigrationProp)) / factivatopic$AllArticles)


factivaplot <- ggplot(factivatopic, aes(x = Day, y = ImmigrationProp, group = 1)) + 
  geom_path(size = 2, color = "slateblue4") +
  geom_path(aes(x = Day, y = up),size = 1, linetype = "dashed") + 
  geom_path(aes(x = Day, y = low),size = 1, linetype = "dashed") + 
  scale_x_continuous(breaks= c(-15, -10, -5, 0, 5, 10, 15)) +
  xlab("Days") +
  ylab("Prevalence") +
  geom_vline(xintercept = 0, color = "black", linetype = "dashed", size = 1) +
  theme_light(base_size = 25) +
  ggtitle("D: Factiva Topic Classification")
factivaplot



